# Remove all (non-hidden) objects from the current environment
rm(list=ls())
# Get the current date as a Date object (the value depends on the run date)
today <- Sys.Date()
print(today)
[1] "2024-03-19"
# Create a Date object from a string in the default YYYY-MM-DD layout
specific_date <- as.Date("2023-12-11")
print(specific_date)
[1] "2023-12-11"
In this practical, we will develop some basic skills and techniques in working with date- and time-based data.
This will help prepare us for further work in exploratory time-series analysis.
Base R uses two main classes to handle date and time data:
Date
: for dates (year, month, day).
POSIXct
and POSIXlt
: for date-time (date plus time of day).
In computer programming, a “class” is a blueprint or template for creating objects, providing initial values for state (member variables or attributes) and implementations of behavior (member functions or methods).
Classes encapsulate data for the objects created from them, enabling the principle of data abstraction and encapsulation. This concept allows for the creation of complex data structures that can model real-world entities or abstract concepts, facilitating object-oriented programming (OOP).
Classes define the properties and functionalities that their instances (objects) will have, allowing for code reuse and modularity.
Let’s start by understanding how to work with these two classes.
Date class
The Date class is the simplest, and is used to handle dates without time.
# Remove all (non-hidden) objects from the current environment
rm(list=ls())
# Get the current date as a Date object (the value depends on the run date)
today <- Sys.Date()
print(today)
[1] "2024-03-19"
# Create a Date object from a string in the default YYYY-MM-DD layout
specific_date <- as.Date("2023-12-11")
print(specific_date)
[1] "2023-12-11"
POSIXct
and POSIXlt
are used for data that includes both date and time information.
POSIXct
represents the date-time as the number of seconds since the beginning of 1970 (known as the Unix epoch), whereas POSIXlt
is a list that contains detailed information about the date-time (its individual components, such as the seconds, minutes, hours, day, month and year).
Note that Excel uses a different time-base for its date and time serial information. 693,960.
# Current date-time as a POSIXct object (the value depends on when it is run)
now <- Sys.time()
print(now)
[1] "2024-03-19 15:27:47 GMT"
# Creating a POSIXct object; no tz is given, so the string is interpreted
# in the session's time zone
specific_datetime <- as.POSIXct("2023-12-11 20:59:59")
print(specific_datetime)
[1] "2023-12-11 20:59:59 GMT"
Using as.Date()
, you can convert character strings to Date
objects. In this example, I’ve specified the format in which the date information is presented (“%Y-%m-%d”):
# Convert a character string (2023-12-11) to a Date, stating its layout
# explicitly: %Y = 4-digit year, %m = month number, %d = day of month
date_from_string <- as.Date("2023-12-11", format = "%Y-%m-%d")
print(date_from_string)
[1] "2023-12-11"
The important point to remember is that R now understands that this object represents time-based information, rather than just a string of characters. It’s similar to how we defined certain strings as factors in previous sections of the module.
Using as.POSIXct()
, we can also work with date-time strings.
# Convert a character string to POSIXct, stating both the date and the
# time layout (%H = hour 00-23, %M = minute, %S = second)
datetime_from_string <- as.POSIXct(
  "2022-01-01 12:00:00",
  format = "%Y-%m-%d %H:%M:%S"
)
print(datetime_from_string)
[1] "2022-01-01 12:00:00 GMT"
Dates and times can come in various formats. I’ve lost track of the number of different ways in which I’ve seen date and time-based information represented in sport-related datasets.
Regardless of the format in which we receive the data, it’s crucial to supply the matching format string in the as.Date()
or as.POSIXct()
functions.
# Different date formats
date_euro_format <- as.Date("01/02/2022", format = "%d/%m/%Y") # Day/Month/Year
print(date_euro_format)
[1] "2022-02-01"
# Time in 12-hour format: %I is the hour on a 12-hour clock and %p the
# AM/PM marker, so "01:30:00 PM" is read as 13:30:00
datetime_12hr <- as.POSIXct(
  "01/02/2022 01:30:00 PM",
  format = "%d/%m/%Y %I:%M:%S %p"
)
print(datetime_12hr)
[1] "2022-02-01 13:30:00 GMT"
In the following code, I’ll create an example dataset that contains the type of ‘messy’ date and time information we often find in sport data:
# Dataset with date and time in different formats:
#   date_string - the raw value as it might arrive in a data file
#   format      - a human-readable description of that value's layout
date_time_data <- data.frame(
  date_string = c(
    "2023-12-19", "19-Dec-2023", "12/19/2023", "20231219",
    "2023/12/19 14:20", "19-Dec-2023 14:20", "12/19/2023 14:20", "202312191420"
  ),
  format = c(
    "YYYY-MM-DD", "DD-MMM-YYYY", "MM/DD/YYYY", "YYYYMMDD",
    "YYYY/MM/DD HH:MM", "DD-MMM-YYYY HH:MM", "MM/DD/YYYY HH:MM", "YYYYMMDDHHMM"
  )
)
# Show original dataset
print("Original Dataset with Various Date Formats")
[1] "Original Dataset with Various Date Formats"
print(date_time_data)
date_string format
1 2023-12-19 YYYY-MM-DD
2 19-Dec-2023 DD-MMM-YYYY
3 12/19/2023 MM/DD/YYYY
4 20231219 YYYYMMDD
5 2023/12/19 14:20 YYYY/MM/DD HH:MM
6 19-Dec-2023 14:20 DD-MMM-YYYY HH:MM
7 12/19/2023 14:20 MM/DD/YYYY HH:MM
8 202312191420 YYYYMMDDHHMM
Now, I can use the same process to convert that information to a format that R will understand. Note that I have to tell R what the format of my original data is:
# Convert date strings to Date objects using as.Date().
# Each row has its own layout, so each element is parsed with its own format
# string. For rows 5-8 the format describes only the date part; as.Date()
# ignores any characters that follow the portion matched by the format, so
# the trailing time is dropped.
date_time_data$date_as_date <- c(
  as.Date(date_time_data$date_string[1], format = "%Y-%m-%d"),
  as.Date(date_time_data$date_string[2], format = "%d-%b-%Y"),
  as.Date(date_time_data$date_string[3], format = "%m/%d/%Y"),
  as.Date(date_time_data$date_string[4], format = "%Y%m%d"),
  as.Date(date_time_data$date_string[5], format = "%Y/%m/%d"),
  as.Date(date_time_data$date_string[6], format = "%d-%b-%Y"),
  as.Date(date_time_data$date_string[7], format = "%m/%d/%Y"),
  as.Date(date_time_data$date_string[8], format = "%Y%m%d")
)
# Convert date strings to POSIXct datetime objects using as.POSIXct().
# Rows 1-4 carry no time of day, so they are parsed as midnight (00:00:00);
# rows 5-8 add %H:%M (or %H%M) to pick up the hour and minute as well.
# No tz is supplied, so values are interpreted in the session's time zone.
date_time_data$datetime_as_posix <- c(
  as.POSIXct(date_time_data$date_string[1], format = "%Y-%m-%d"),
  as.POSIXct(date_time_data$date_string[2], format = "%d-%b-%Y"),
  as.POSIXct(date_time_data$date_string[3], format = "%m/%d/%Y"),
  as.POSIXct(date_time_data$date_string[4], format = "%Y%m%d"),
  as.POSIXct(date_time_data$date_string[5], format = "%Y/%m/%d %H:%M"),
  as.POSIXct(date_time_data$date_string[6], format = "%d-%b-%Y %H:%M"),
  as.POSIXct(date_time_data$date_string[7], format = "%m/%d/%Y %H:%M"),
  as.POSIXct(date_time_data$date_string[8], format = "%Y%m%d%H%M")
)
# Show the dataset with converted date and datetime columns
print("Dataset with Converted Date and DateTime Columns")
[1] "Dataset with Converted Date and DateTime Columns"
print(date_time_data)
date_string format date_as_date datetime_as_posix
1 2023-12-19 YYYY-MM-DD 2023-12-19 2023-12-19 00:00:00
2 19-Dec-2023 DD-MMM-YYYY 2023-12-19 2023-12-19 00:00:00
3 12/19/2023 MM/DD/YYYY 2023-12-19 2023-12-19 00:00:00
4 20231219 YYYYMMDD 2023-12-19 2023-12-19 00:00:00
5 2023/12/19 14:20 YYYY/MM/DD HH:MM 2023-12-19 2023-12-19 14:20:00
6 19-Dec-2023 14:20 DD-MMM-YYYY HH:MM 2023-12-19 2023-12-19 14:20:00
7 12/19/2023 14:20 MM/DD/YYYY HH:MM 2023-12-19 2023-12-19 14:20:00
8 202312191420 YYYYMMDDHHMM 2023-12-19 2023-12-19 14:20:00
Sometimes you might want to extract the year, month, day etc., from an existing variable.
This can be done as follows:
specific_datetime <- as.POSIXct("2023-12-11 20:59:59")

# Extracting components: format() returns each piece as a character string
# (e.g. "12", not the number 12)
year <- format(specific_datetime, "%Y")
month <- format(specific_datetime, "%m")
day <- format(specific_datetime, "%d")
hour <- format(specific_datetime, "%H")
minutes <- format(specific_datetime, "%M")
seconds <- format(specific_datetime, "%S")
print(paste("Year:", year, "- Month:", month, "- Day:", day, "- Hour:", hour, "- Minutes:", minutes, "- Seconds:", seconds))
[1] "Year: 2023 - Month: 12 - Day: 11 - Hour: 20 - Minutes: 59 - Seconds: 59"
For further analysis, we might wish to modify or extract elements from our time-based data.
Some examples include:
# Adding days to a date: Date arithmetic is in whole days, so + 30 moves
# the date 30 days forward
future_date <- specific_date + 30
print(future_date)
[1] "2024-01-10"
# Subtracting time from a datetime: as.difftime() makes the unit explicit
# (subtracting a bare number from a POSIXct would be taken as seconds)
past_datetime <- specific_datetime - as.difftime(1, units = "hours")
print(past_datetime)
[1] "2023-12-11 19:59:59 GMT"
# Difference in days: subtracting two Dates gives a difftime object
date_diff <- as.Date("2022-02-01") - as.Date("2022-01-01")
print(date_diff)
Time difference of 31 days
# Difference in seconds: the difftime's unit is chosen automatically, so
# request seconds explicitly when converting it to a number
time_diff <- as.POSIXct("2022-01-01 13:00:00") - as.POSIXct("2022-01-01 12:00:00")
print(as.numeric(time_diff, units = "secs"))
[1] 3600
Handling time zones in POSIXct
is a critical aspect of date-time manipulation.
This is particularly important if you’re working with data gathered from different countries.
# Creating POSIXct objects with specific time zones: the same clock time
# ("12:00:00") is interpreted in each zone, so the two objects are
# different instants
datetime_ny <- as.POSIXct("2023-01-01 12:00:00", tz = "America/New_York")
datetime_london <- as.POSIXct("2023-01-01 12:00:00", tz = "Europe/London")
# Comparing times
print(datetime_ny)
[1] "2023-01-01 12:00:00 EST"
print(datetime_london)
[1] "2023-01-01 12:00:00 GMT"
lubridate
package
So far, we’ve focused on the two time-based functions that come with base R. Now, we will add the lubridate
package to our toolkit.
The lubridate
package is a powerful and user-friendly tool designed to simplify the handling and manipulation of dates and times.
As part of the tidyverse
, it provides a comprehensive set of functions that make it easier to perform common tasks such as parsing, manipulating, and doing arithmetic with date-time objects.
lubridate
addresses the complexity of date-time data types by offering functions that intuitively deal with time zones, daylight saving times, and various date-time formats.
It’s important to note that its functionality is built around three main kinds of object: dates, date-times (POSIXct and POSIXlt), and time spans (durations, intervals, or periods).
library(lubridate)
Attaching package: 'lubridate'
The following objects are masked from 'package:base':
date, intersect, setdiff, union
# Easy parsing of dates: the function name gives the order of the
# year (y), month (m) and day (d) components in the input string
ymd("20220101")
[1] "2022-01-01"
mdy("01/02/2022")
[1] "2022-01-02"
dmy("02-01-2022")
[1] "2022-01-02"
# Arithmetic with lubridate
date1 <- ymd("2022-01-01")
date1 %m+% months(1) # Add a month
[1] "2022-02-01"
date1 %m-% months(1) # Subtract a month
[1] "2021-12-01"
# Extracting components
year(date1)
[1] 2022
month(date1)
[1] 1
day(date1)
[1] 1
# Rounding off date and time to the nearest day, hour, etc.
round_date(datetime_ny, unit="day")
[1] "2023-01-02 EST"
floor_date(datetime_ny, unit="hour")
[1] "2023-01-01 12:00:00 EST"
ceiling_date(datetime_ny, unit="minute")
[1] "2023-01-01 12:00:00 EST"
# Dealing with duration and period - understanding the difference between duration (exact time spans) and period (human-readable time spans).
# Duration: exact time spans, created with the d-prefixed helpers
duration_one_day <- ddays(1)
duration_one_hour <- dhours(1)
datetime_ny + duration_one_day
[1] "2023-01-02 12:00:00 EST"
# Period: human-readable time spans (here, one calendar month)
period_one_month <- months(1)
date1 + period_one_month
[1] "2022-02-01"
# Handling Daylight Saving Time: dealing with complexities due to changes in daylight saving time.
# Before daylight saving time
dt1 <- as.POSIXct("2022-03-13 01:59:59", tz = "America/New_York")

# After daylight saving time: dhours(1) adds an exact hour of elapsed time
dt2 <- dt1 + dhours(1)
print(dt1)
[1] "2022-03-13 01:59:59 EST"
print(dt2)
[1] "2022-03-13 03:59:59 EDT"